{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import IPython.display as ipd\n",
    "\n",
    "import os\n",
    "import json\n",
    "import math\n",
    "import torch\n",
    "from torch import nn\n",
    "from torch.nn import functional as F\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "import commons\n",
    "import utils\n",
    "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n",
    "from models import SynthesizerTrn\n",
    "from text.symbols import symbols\n",
    "from text import text_to_sequence\n",
    "\n",
    "from scipy.io.wavfile import write\n",
    "\n",
    "\n",
    "def get_text(text, hps):\n",
    "    text_norm = text_to_sequence(text, hps.data.text_cleaners)\n",
    "    if hps.data.add_blank:\n",
    "        text_norm = commons.intersperse(text_norm, 0)\n",
    "    text_norm = torch.LongTensor(text_norm)\n",
    "    return text_norm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MB-iSTFT-VITS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hps = utils.get_hparams_from_file(\"./configs/ljs_mb_istft_vits.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "net_g = SynthesizerTrn(\n",
    "    len(symbols),\n",
    "    hps.data.filter_length // 2 + 1,\n",
    "    hps.train.segment_size // hps.data.hop_length,\n",
    "    **hps.model).cuda()\n",
    "_ = net_g.eval()\n",
    "\n",
    "_ = utils.load_checkpoint(\"./logs/ljs_mb_istft_vits/G_800000.pth\", net_g, None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stn_tst = get_text(\"This is a sample audio\", hps)\n",
    "with torch.no_grad():\n",
    "    x_tst = stn_tst.cuda().unsqueeze(0)\n",
    "    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n",
    "    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n",
    "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
